from __future__ import print_function
import os.path
import dalmatian as dm
import pandas as pd
import sys
sys.path.insert(0, '../../')
#import Datanalytics as da
from JKBio import TerraFunction as terra
%load_ext autoreload
%autoreload 2
from JKBio import Helper as h
import pickle
from taigapy import TaigaClient
tc = TaigaClient()
import numpy as np
import itertools
from bokeh.plotting import *
from bokeh.models import HoverTool
output_notebook()
import matplotlib.pyplot as plt
import seaborn as sns
import gseapy
from JKBio.helper import pyDESeq2
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
! gsutil mv gs://transfer-amlproject/*MP7624* gs://transfer-amlproject/RNPv2/
! gsutil -m cp -r gs://transfer-amlproject/RNPv2/ gs://amlproject/
! gsutil ls gs://amlproject/
sampleset='MAX_AML_RNPv2'
! gsutil -m cp -r gs://amlproject/RNPv2/ ../../data
! bwa index -a bwtsw ../data/ERCC92/ERCC92.fa
! samtools faidx ../data/ERCC92/ERCC92.fa
from JKBio import Helper as h
! ../../TrimGalore-0.6.5/trim_galore
ls -alh res
h.getSpikeInControlScales('../data/ERCC92/ERCC92.fa', fastQfolder='res/', mapper='bwa', pairedEnd=True, cores=10, pathtosam='samtools', pathtotrim_galore='../../TrimGalore-0.6.5/trim_galore', pathtobwa='bwa',totrim=False, tomap=True, tofilter=True, results='res/', toremove=True)
terra.uploadFromFolder('amlproject','RNPv2/',
'broad-firecloud-ccle/hg38_RNAseq',samplesetname=sampleset,
fformat="fastqR1R2", sep='_MP7624')
wm = dm.WorkspaceManager('broad-firecloud-ccle/hg38_RNAseq')
star = wm.get_config("star_v1-0_BETA_cfg")
star
submission_id = wm.create_submission(star['name'], sampleset, 'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
submission_id = wm.create_submission("rsem_v1-0_BETA_cfg",
sampleset,'sample_set',expression='this.samples')
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
submission_id = wm.create_submission("rsem_aggregate_results_v1-0_BETA_cfg",
sampleset)
terra.waitForSubmission('broad-firecloud-ccle/hg38_RNAseq', submission_id)
results = wm.get_sample_sets().loc[sampleset]
rsem_genes_expected_count = results['rsem_genes_expected_count']
results
mkdir ../../data/RNPv2
! gsutil cp $rsem_genes_expected_count ../../data/RNPv2/
file = '../../data/RNPv2/'+rsem_genes_expected_count.split('/')[-1]
file
! gunzip $file
rsem_genes_expected_count = pd.read_csv(file[:-3], sep='\t')
data = rsem_genes_expected_count.drop("transcript_id(s)",1)
data["gene_id"] = h.convertGenes(data['gene_id'])[0]
data=data.set_index('gene_id')
data
rename = {"1": "mr120-MV411-RNP_IRF2BP2-r4",
"2": "mr121-MV411-RNP_IRF2BP2-r5",
"3": "mr122-MV411-RNP_IRF2BP2-r6",
"4": "mr123-MV411-RNP_IRF8-r4",
"5": "mr124-MV411-RNP_IRF8-r5",
"6": "mr125-MV411-RNP_IRF8-r6",
"7": "mr126-MV411-RNP_MEF2D-r4",
"8": "mr127-MV411-RNP_MEF2D-r5",
"9": "mr128-MV411-RNP_MEF2D-r6",
"10": "mr129-MV411-RNP_MYC-r4",
"11": "mr130-MV411-RNP_MYC-r5",
"12": "mr131-MV411-RNP_MYC-r6",
"13": "mr132-MV411-RNP_RUNX1-r4",
"14": "mr133-MV411-RNP_RUNX1-r5",
"15": "mr134-MV411-RNP_RUNX1-r6",
"16": "mr135-MV411-RNP_RUNX2-r4",
"17": "mr136-MV411-RNP_RUNX2-r5",
"18": "mr137-MV411-RNP_RUNX2-r6",
"19": "mr138-MV411-RNP_SPI1-r4",
"20": "mr139-MV411-RNP_SPI1-r5",
"21": "mr140-MV411-RNP_SPI1-r6",
"22": "mr141-MV411-RNP_ZMYND8-r4",
"23": "mr142-MV411-RNP_ZMYND8-r5",
"24": "mr143-MV411-RNP_ZMYND8-r6",
"25": "mr144-MV411-RNP_LMO2-r4",
"26": "mr145-MV411-RNP_LMO2-r5",
"27": "mr146-MV411-RNP_LMO2-r6",
"28": "mr147-MV411-RNP_LYL1-r4",
"29": "mr148-MV411-RNP_LYL1-r5",
"30": "mr149-MV411-RNP_LYL1-r6",
"31": "mr150-MV411-RNP_MAX-r4",
"32": "mr151-MV411-RNP_MAX-r5",
"33": "mr152-MV411-RNP_MAX-r6",
"34": "mr153-MV411-RNP_ZEB2-r4",
"35": "mr154-MV411-RNP_ZEB2-r5",
"36": "mr155-MV411-RNP_ZEB2-r6",
"37": "mr156-MV411-RNP_MEF2C-r4",
"38": "mr157-MV411-RNP_MEF2C-r5",
"39": "mr158-MV411-RNP_MEF2C-r6",
"40": "mr159-MV411-RNP_MEIS1-r4",
"41": "mr160-MV411-RNP_MEIS1-r5",
"42": "mr161-MV411-RNP_MEIS1-r6",
"43": "mr162-MV411-RNP_FLI1-r4",
"44": "mr163-MV411-RNP_FLI1-r5",
"45": "mr164-MV411-RNP_FLI1-r6",
"46": "mr165-MV411-RNP_ELF2-r4",
"47": "mr166-MV411-RNP_ELF2-r5",
"48": "mr167-MV411-RNP_ELF2-r6",
"49": "mr168-MV411-RNP_GFI1-r4",
"50": "mr169-MV411-RNP_GFI1-r5",
"51": "mr170-MV411-RNP_GFI1-r6",
"52": "mr171-MV411-RNP_IKZF1-r4",
"53": "mr172-MV411-RNP_IKZF1-r5",
"54": "mr173-MV411-RNP_IKZF1-r6",
"55": "mr174-MV411-RNP_CEBPA-r4",
"56": "mr175-MV411-RNP_CEBPA-r5",
"57": "mr176-MV411-RNP_CEBPA-r6",
"58": "mr177-MV411-RNP_MYB-r4",
"59": "mr178-MV411-RNP_MYB-r5",
"60": "mr179-MV411-RNP_MYB-r6",
"61": "mr180-MV411-RNP_MYBL2-r1",
"62": "mr181-MV411-RNP_MYBL2-r2",
"63": "mr182-MV411-RNP_MYBL2-r3",
"64": "mr183-MV411-RNP_HOXA9-r4",
"65": "mr184-MV411-RNP_HOXA9-r5",
"66": "mr185-MV411-RNP_HOXA9-r6",
"67": "mr186-MV411-RNP_AAVS1-r1",
"68": "mr187-MV411-RNP_AAVS1-r2",
"69": "mr188-MV411-RNP_AAVS1-r3",
"70": "mr189-MV411-RNP_SP1-r4",
"71": "mr190-MV411-RNP_SP1-r5",
"72": "mr191-MV411-RNP_SP1-r6",
"73": "mr192-MV411-RNP_SP1-r7"}
data.columns = [rename[i.split('_')[1]] for i in data.columns]
data
filter some more
toremove = np.argwhere(data.values.var(1)==0)
toremove.ravel()
toremove.shape
data = data.drop(data.iloc[toremove.ravel()].index,0)
data.shape
renormalize the data
ctf=pd.read_csv('../data/CTF.csv',header=None)[0].values.tolist()
ctf
genenames = data.index
ctfpos = [i for i, val in enumerate(genenames) if val in ctf]
notctfpos = [i for i, val in enumerate(genenames) if val not in ctf]
We find a CTF not in the dataset
[val for val in ctf if val not in genenames]
ctf.remove('IKAROS')
data = data.reset_index(drop=True)
experiments = list(set([i.split('-')[2] for i in data.columns[:-1]]))
experiments.remove("RNP_AAVS1")
for val in experiments:
design = pd.DataFrame(index=data.columns[:-1], columns=['DMSO','Target'],
data=np.array([[1 if 'RNP_AAVS1' in i else 0 for i in data.columns[:-1]],[1 if val in i else 0 for i in data.columns[:-1]]]).T)
design.index = design.index.astype(str).str.replace('-','.')
deseq = pyDESeq2.pyDESeq2(count_matrix=data, design_matrix = design,
design_formula='~DMSO + Target', gene_column="gene_id")
deseq.run_deseq()
deseq.get_deseq_result()
r = deseq.deseq_result
r.pvalue = np.nan_to_num(np.array(r.pvalue), 1)
r.log2FoldChange = np.nan_to_num(np.array(r.log2FoldChange), 0)
results[val] = r
results.pvalue = np.nan_to_num(np.array(results.pvalue), 1)
results.log2FoldChange = np.nan_to_num(np.array(results.log2FoldChange), 0)
results.gene_id = convertGenes(results.gene_id)[0]
show(volcano(results,tohighlight=ctf))
results
for val in experiments:
a = h.volcano(results[val],tohighlight=ctf,title=val, maxvalue= 60, searchbox=True, minlogfold=0.5)
try:
show(a)
except RuntimeError:
show(a)
datad = data
data = data.drop(columns='mr129-MV411-RNP_MYC-r4')
col = {v:i for i, v in enumerate(set([i.split('-')[2] for i in data.columns[:-1]]))}
red = PCA(2).fit_transform(data[data.columns[:-1]].T)
h.scatter(red, labels=data.columns[:-1], radi=60000, colors=[col[i.split('-')[2]] for i in data.columns[:-1]])
red = PCA(30).fit_transform(data[data.columns[:-1]].T)
red = TSNE(2,4).fit_transform(red)
red.shape
mr129-MYC-r4 seems weird
h.scatter(red, labels=data.columns[:-1], radi=10, colors=[col[i.split('-')[2]] for i in data.columns[:-1]])
pca = PCA(20)
red = pca.fit_transform(data.T)
pca.explained_variance_ratio_
res = {}
data = datad
totest
data = data.set_index('gene_id',drop=True)
res[val]
for val in experiments:
print(val)
totest = data[[v for v in data.columns[:-1] if val in v or 'AAVS1' in v]]
cls = ['Condition' if val in v else 'DMSO' for v in totest.columns]
res[val] = gseapy.gsea(data=totest, gene_sets='WikiPathways_2013',
cls= cls, no_plot=False, processes=10)
res[val].res2d['Term'] = [i for i in res[val].res2d.index]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
with open('../data/wikipathway_RNPv2', 'wb') as f:
pickle.dump(res,f)
with open('../data/wikipathway_RNPv2','rb') as f:
res = pickle.load(f)
for i, val in enumerate(experiments):
plt.figure(i)
res[val].res2d.Term = [i[2:-13] for i in res[val].res2d.index]
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
a = set()
for k, val in res.items():
a.update(set(val.res2d.index))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
for i,v in val.res2d.iterrows():
a[i][n] = v.es
res = pd.DataFrame(a, index=res.keys())
fig, ax = plt.subplots(figsize=(20,15))
sns.heatmap(ax=ax,data=res)
model = AgglomerativeClustering(n_clusters=6,linkage="average",
affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(res)
ii = itertools.count(res.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
a = plotCorrelationMatrix(res.values[sort],res.index[sort].tolist(),interactive=True)
fig, ax = plt.subplots(figsize=(20,15))
sns.heatmap(ax=ax,data=res)
fig.savefig("enriched_terms.png")
show(a)
fi
experiments
data
res = {}
for i, val in enumerate(experiments):
print(val)
totest = data[[v for v in data.columns[:-1] if val in v or 'AAVS1' in v]]
cls = ['Condition' if val in v else 'DMSO' for v in totest.columns]
res[val] = gseapy.gsea(data=totest, gene_sets='GO_Biological_Process_2015',
cls= cls, no_plot=False, processes=14)
res[val].res2d['Term'] = [i for i in res[val].res2d.index]
plt.figure(i)
sns.barplot(data=res[val].res2d.iloc[:25], x="es", y="Term",
hue_order="geneset_size").set_title(val)
with open('../data/GO_Biological_Process_2015_RNPv2', 'wb') as f:
pickle.dump(res,f)
with open('GO_Biological_Process_2015','rb') as f:
res = pickle.load(f)
creating matrices
a = set()
for k, val in res.items():
a.update(set(val.res2d.Term))
a = {i:[0]*len(res) for i in a}
for n,(k, val) in enumerate(res.items()):
for i,v in val.res2d.iterrows():
a[v.Term][n] = v.es
res = pd.DataFrame(a, index=res.keys())
fig, ax = plt.subplots(figsize=(20,15))
sns.heatmap(ax=ax,data=res)
model = AgglomerativeClustering(n_clusters=8,linkage="average",
affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(res)
ii = itertools.count(res.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
a = h.plotCorrelationMatrix(res.values[sort],res.index[sort].tolist(),interactive=True,title="RNP2_bioproc_corr")
cluster1= ['LMO2','LYL1','MAX','MEF2C']
cluster2=['GFI1','FLI1','MYB','IKZF1','ELF2','CEBPa','MEIS1']
cluster3=['IRF2BP2','MEF2C','CDK6','MEF2D','IRF8','BRD4','MYC']
cluster4= ['RUNX1','RUNX2','ZMYND8']
res.loc[cluster2].mean().sort_values()
'GO_Molecular_Function_2015',
'GeneSigDB',
'ENCODE_TF_ChIP-seq_2014',
#'Drug_Perturbations_from_GEO_2014',
'GO_Cellular_Component_2015',
'GO_Biological_Process_2015',
'PPI_Hub_Proteins',
'WikiPathways_2013',
'TF-LOF_Expression_from_GEO',
# msig db C2 C6 H http://software.broadinstitute.org/gsea/msigdb/annotate.jsp
# max's crc
ctf = [
'BRD4',
'CDK6',
'CEBPA',
'ELF2',
'FLI1',
'GFI1',
'IKZF1',
'IRF2BP2',
'IRF8',
'LMO2',
'LYL1',
'MAX',
'MEF2C',
'MEF2D',
'MEIS1',
'MYB',
'MYC',
'RUNX1',
'RUNX2',
'SPI1',
'ZEB2',
'ZMYND8'
]
deseq = pd.DataFrame()
for k, val in results.items():
deseq[k] = val.log2FoldChange
deseq=deseq.T
deseq
a = plotCorrelationMatrix(a, deseq.index[sort].tolist(),interactive=True)
ctf[11] = 'CEBPa'
ctf[]
ctf
dropping ETV6 SP1 GSE1 LDB1
deseq.loc[['MYC',
'MYB',
'SPI1',
'RUNX1',
'IRF2BP2',
'FLI1',
'ELF2',
'ZEB2',
'GFI1',
'LMO2',
'CEBPa',
'MEF2D',
'MEF2C',
'IRF8',
'MEIS1',
'RUNX2',
'RUNX2',
'ZMYND8']]
show(a)
deseq_ctf = deseq.loc[['MYC',
'MYB',
'SPI1',
'RUNX1',
'IRF2BP2',
'FLI1',
'ELF2',
'ZEB2',
'GFI1',
'LMO2',
'CEBPa',
'MEF2D',
'MEF2C',
'IRF8',
'MEIS1',
'RUNX2',
'ZMYND8']]
model = AgglomerativeClustering(n_clusters=7,linkage="average",
affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(deseq_ctf)
ii = itertools.count(deseq_ctf.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
a = deseq_ctf.values[sort]
a = plotCorrelationMatrix(a, deseq_ctf.index[sort].tolist(),interactive=True)
show(a)
model = AgglomerativeClustering(n_clusters=7,linkage="average",
affinity="cosine", compute_full_tree=True)
labels = model.fit_predict(deseq)
ii = itertools.count(deseq.shape[0])
tree = [{'node_id': next(ii), 'left': x[0], 'right':x[1]} for x in model.children_]
sort = labels.argsort()
a = deseq.values[sort]
a = plotCorrelationMatrix(a, deseq.index[sort].tolist(),interactive=True)
show(a)
tsne, pca, clustering accross TF, CRC, most var genes, both ways.